Pathway activity inference for multiple samples

This tutorial demonstrates how to use the FeaSc software for multi-sample pathway activity inference analysis. FeaSc is a Python package specifically designed for pathway activity analysis in single-cell RNA sequencing data, supporting multiple dimensionality reduction methods and batch effect correction. When analyzing multiple samples with batch effects, there are two main approaches: (1) ignore batch effects and perform pathway activity inference as done for single samples by integrating results from classical dimensionality reduction methods such as PCA, NMF and MCA, or (2) use scVI as a preprocessing step for batch effect correction.

Importing packages

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

import scanpy as sc
import numpy as np
import pandas as pd
import anndata as ad
import scipy
import scvi
from numpy import unique
import FeaSc as fsc

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.metrics import roc_curve, auc

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.rcParams['figure.figsize'] = [6, 4]
plt.rcParams['figure.dpi'] = 100
def plot_roc_curve(labels, scores, target_celltype, method_name="Method", 
                   figsize=(6, 5), show_plot=True):
    
    y_true = (labels == target_celltype).astype(int)
    y_score = scores
    
    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)
    
    plt.figure(figsize=figsize)
    plt.plot(fpr, tpr, linewidth=2, 
             label=f'{method_name} (AUC = {roc_auc:.3f})')
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray', alpha=0.7)
    
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate', fontsize=12)
    plt.title(f'ROC Curve for {target_celltype} Detection\n({method_name})', 
              fontsize=14)
    plt.legend(loc='lower right', fontsize=11)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    
    # 返回结果
    return {
        'fpr': fpr,
        'tpr': tpr,
        'auc': roc_auc,
        'thresholds': thresholds
    }

loading data set

In this section, we load and preprocess single-cell RNA sequencing data from the GSE96583 dataset, which contains immune cells under two conditions: with and without interferon stimulation.

adata = sc.read_h5ad('data/h5ad/GSE96583_IFN.h5ad')
sc.pp.filter_genes(adata, min_cells=3)
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.highly_variable_genes(adata, n_top_genes=3000, flavor="seurat_v3")
adata = adata[:, adata.var.highly_variable]
adata.layers['counts'] = adata.X.copy()
adata
AnnData object with n_obs × n_vars = 28871 × 3000
    obs: 'tsne1', 'tsne2', 'ind', 'stim', 'cluster', 'cell', 'multiplets', 'n_genes'
    var: 'n_cells', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'hvg'
    layers: 'counts'
adata.obs
tsne1 tsne2 ind stim cluster cell multiplets n_genes
cell_id
AAACATACAATGCC-1 -4.277833 -19.294709 107 ctrl 5 CD4 T cells doublet 852
AAACATACATTTCC-1 -27.640373 14.966629 1016 ctrl 9 CD14+ Monocytes singlet 878
AAACATACCAGAAA-1 -27.493646 28.924885 1256 ctrl 9 CD14+ Monocytes singlet 713
AAACATACCAGCTA-1 -28.132584 24.925484 1256 ctrl 9 CD14+ Monocytes doublet 950
AAACATACCATGCA-1 -10.468194 -5.984389 1488 ctrl 3 CD4 T cells singlet 337
TTTGCATGCTAAGC-1 25.142392 6.603815 107 stim 6 CD4 T cells singlet 523
TTTGCATGGGACGA-1 14.359657 10.965601 1488 stim 6 CD4 T cells singlet 503
TTTGCATGGTGAGG-1 27.317997 7.933458 1488 stim 6 CD4 T cells ambs 448
TTTGCATGGTTTGG-1 13.744084 9.347784 1244 stim 6 CD4 T cells ambs 422
TTTGCATGTCTTAC-1 14.572118 -4.713942 1016 stim 5 CD4 T cells singlet 421

28871 rows × 8 columns

Preprocessing by scVI

Here we use scVI for batch correction. Running scVI on GPU will be much faster.

scvi.model.SCVI.setup_anndata(adata = adata, layer="counts",batch_key="stim")
model = scvi.model.SCVI(adata, n_latent=20)
model.train(max_epochs=400, early_stopping=True, early_stopping_patience=20)
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



Training:   0%|          | 0/400 [00:00<?, ?it/s]


`Trainer.fit` stopped: `max_epochs=400` reached.
reconstructed = model.get_normalized_expression(n_samples=10, return_mean=True,library_size="latent")
reconstructed.iloc[:, :10]
GeneSymbol HES4 ISG15 TNFRSF18 TNFRSF4 MXRA8 ANKRD65 NADK PRKCZ FAM213B RP11-46F15.2
cell_id
AAACATACAATGCC-1 0.019033 0.210214 0.008682 0.012635 0.008505 1.966600e-07 0.007852 0.016129 0.056992 0.001184
AAACATACATTTCC-1 0.124047 1.775051 0.130992 0.141307 0.000300 5.500610e-03 0.187209 0.006264 0.024018 0.003484
AAACATACCAGAAA-1 0.039507 0.192389 0.193688 0.164865 0.001958 2.018716e-04 0.075346 0.010887 0.006039 0.000531
AAACATACCAGCTA-1 0.125339 1.100067 0.320633 0.149594 0.000137 4.099355e-05 0.098459 0.040244 0.003708 0.000075
AAACATACCATGCA-1 0.031996 0.497427 0.013899 0.041660 0.000922 3.332141e-06 0.008444 0.008473 0.000527 0.001109
TTTGCATGCTAAGC-1 0.022001 6.526540 0.022284 0.072436 0.000079 8.381066e-06 0.009712 0.011650 0.005869 0.000226
TTTGCATGGGACGA-1 0.007349 5.203463 0.011646 0.037932 0.000172 8.764559e-07 0.016608 0.011724 0.011683 0.001016
TTTGCATGGTGAGG-1 0.018921 4.360673 0.008313 0.025147 0.000127 3.003887e-06 0.004238 0.021204 0.007711 0.000681
TTTGCATGGTTTGG-1 0.020848 5.322153 0.016481 0.020105 0.000611 5.446282e-04 0.019620 0.004102 0.004913 0.001470
TTTGCATGTCTTAC-1 0.048356 5.451257 0.093828 0.541265 0.000064 7.868779e-06 0.010495 0.018504 0.020468 0.002103

28871 rows × 10 columns

adata.X = reconstructed.values
latent = model.get_latent_representation()
adata.obsm['X_scVI'] = latent
sc.pp.neighbors(adata, use_rep='X_scVI', n_neighbors=15)
sc.tl.umap(adata, min_dist=0.3)
sc.pl.umap(adata, color=['stim'])
png
sc.pl.umap(adata, color=['cell'])
png

MCA-based pathway inference

path_bg = 'data/c2.cp.v2025.1.Hs.symbols.gmt'
path_fg = 'data/B-cell_marker.txt'
gs = pd.read_table(path_fg, header=0).iloc[:, 0].tolist()
gsList = {'gs': gs}
bgList = fsc.read_gmt(path_bg)
grate = fsc.build_gene_rate(bg_list=bgList, gs_list=gsList)
grate
geneset background
MUC16 0.0 0.002982
SMYD1 0.0 0.000746
ZNF14 0.0 0.000497
DLGAP4 0.0 0.001243
ZZZ3 0.0 0.001243
UCK2 0.0 0.001740
ENTPD1 0.0 0.002485
RDH16 0.0 0.001740
USP48 0.0 0.000746
SLCO1B1 0.0 0.005716

13871 rows × 2 columns

adata = fsc.run_reduction(adata, n_dim=15, method="mca")
run mca...
mca step 1: Construct the Fuzzy Matrix and Row Weights
mca step 1 completed: Fuzzy Matrix and Row Weights constructed
svd started
svd completed
mca step 2: Calculate Cell Coordinates and Feature Loadings
mca step 2 completed: Cell Coordinates and Feature Loadings calculated
mca_scores = fsc.calculate_activity(adata, grate, method="mca", n_dim=12)
celltype_labels = adata.obs['cell'].copy()

result = plot_roc_curve(
    labels=celltype_labels,
    scores=mca_scores,
    target_celltype="B cells",
    method_name="MCA-based"
)
png

We continue to score the IFN pathway and test whether the scores are higher in stim condition compared to ctrl.

path_bg = 'data/c2.cp.v2025.1.Hs.symbols.gmt'
path_fg = 'data/IFN/INF_SIG.txt'
gs = pd.read_table(path_fg, header=0).iloc[:, 0].tolist()
gsList = {'gs': gs}
bgList = fsc.read_gmt(path_bg)
grate = fsc.build_gene_rate(bg_list=bgList, gs_list=gsList)
grate
geneset background
MUC16 0.0 0.002982
SMYD1 0.0 0.000746
ZNF14 0.0 0.000497
DLGAP4 0.0 0.001243
ZZZ3 0.0 0.001243
UCK2 0.0 0.001740
ENTPD1 0.0 0.002485
RDH16 0.0 0.001740
USP48 0.0 0.000746
SLCO1B1 0.0 0.005716

13868 rows × 2 columns

mca_scores = fsc.calculate_activity(adata, grate, method="mca",n_dim=12)
celltype_labels = adata.obs['stim'].copy()

result = plot_roc_curve(
    labels=celltype_labels,
    scores=mca_scores,
    target_celltype="stim",
    method_name="MCA-based"
)
png